#load relevant libraries
library(dplyr)
library(ggplot2)
# Read the cleaned data summary export into a new data frame
MasterData <- read.csv(file = "~/downloads/Data Summary_CLEAN.csv")
# Keep only the first 12 columns; the remaining ones are redundant
MasterData <- MasterData[, seq_len(12)]
# Replace the imported headers with short variable names
names(MasterData) <- c(
  "ID", "Sex", "Language",
  "Final_grade", "IU", "Per_correct", "Per_aware",
  "Time_hw", "Time_pron", "Time_LS",
  "Per_correct_hw", "Per_complete_LS"
)
# Examine scatter plots for different variable combination pairs.
# Scatter-plot matrix of all pairs. data.matrix() converts character/factor
# columns to integer codes first: pairs() stops with
# "non-numeric argument to 'pairs'" when handed character columns, which is
# what read.csv() produces for text columns by default since R 4.0.0.
pairs(data.matrix(MasterData))

# Each plot below overlays stat_sum() on the raw points so that coincident
# observations are drawn with a size proportional to their count.

# Time spent on Connect homework, Percentage of correct homework responses
ggplot(MasterData, aes(x = Time_hw, y = Per_correct_hw)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Time spent on Connect homework, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(x = Time_hw, y = Time_LS)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of correct homework responses, Final course grade
ggplot(MasterData, aes(x = Per_correct_hw, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of correct LearnSmart responses, Percentage of awareness of correct/incorrect responses
ggplot(MasterData, aes(x = Per_correct, y = Per_aware)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of completion of assigned LearnSmart activities, Final course grade
ggplot(MasterData, aes(x = Per_complete_LS, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of completion of assigned LearnSmart activities, Percentage of incorrect and unaware LearnSmart responses
ggplot(MasterData, aes(x = Per_complete_LS, y = IU)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Time spent on LearnSmart adaptive activities, Final course grade
ggplot(MasterData, aes(x = Time_LS, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of incorrect and unaware LearnSmart responses, Final course grade
ggplot(MasterData, aes(x = IU, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of incorrect and unaware LearnSmart responses, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(x = IU, y = Time_LS)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of incorrect and unaware LearnSmart responses, Time spent on Connect homework
ggplot(MasterData, aes(x = IU, y = Time_hw)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Create new data frame for scaled data: columns 1-3 of MasterData
# (ID, Sex, Language) are carried over unchanged, and the clustering
# variables Final_grade through Per_complete_LS are standardised with scale().
ScaledData <- as.data.frame(
  cbind(
    MasterData[, 1:3],
    scale(select(MasterData, Final_grade:Per_complete_LS))
  )
)
# Calculate mean and standard deviation for the scaled data
# (expected: means of ~0 and standard deviations of 1).
sapply(ScaledData[, 4:12], mean)
# Console output from a previous run, kept as comments so the script parses:
#    Final_grade            IU   Per_correct     Per_aware       Time_hw     Time_pron       Time_LS
#   5.843135e-17 -1.133857e-16  9.600179e-17 -4.490148e-16 -3.776554e-17  5.052708e-17  8.279565e-17
# Per_correct_hw Per_complete_LS
#   2.015534e-16    4.870925e-18
sapply(ScaledData[, 4:12], sd)
# Console output from a previous run:
#    Final_grade IU Per_correct Per_aware Time_hw Time_pron Time_LS
#              1  1           1         1       1         1       1
# Per_correct_hw Per_complete_LS
#              1               1
# Within Group Sum of Squares ("elbow") plot.
#
# Runs kmeans() for every cluster count from 2 to nc and plots the total
# within-group sum of squares against the number of clusters.
#
# Args:
#   data: numeric data frame or matrix, one row per observation.
#   nc:   largest number of clusters to evaluate (default 15); must be >= 1.
#   seed: value passed to set.seed() before every kmeans() call so that
#         repeated runs give the same curve.
#
# Returns (invisibly) the numeric vector of sums of squares; element k
# belongs to the k-cluster solution.
wssplot <- function(data, nc=15, seed=1234){
  stopifnot(nc >= 1)
  # One cluster: total sum of squares around the column means
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  # seq_len(nc)[-1] is 2..nc and is empty for nc == 1, whereas 2:nc would
  # count down (2, 1) and leave wss longer than the x axis.
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}
# Draw the WSS graph for 1 to 15 clusters using columns 4-12 of ScaledData
wssplot(data = ScaledData[, 4:12], nc = 15, seed = 1234)

# Create and examine several different possible cluster solutions (k = 3, 4, 5).
# nstart = 10 makes kmeans() try ten random starting sets per solution.
threeclusterkmeans <- kmeans(ScaledData[, 4:12], 3, nstart = 10)
threeclusterkmeans
fourclusterkmeans <- kmeans(ScaledData[, 4:12], 4, nstart = 10)
fourclusterkmeans
# The five-cluster solution was used below without ever being fitted, which
# made data.frame() fail with "object 'fiveclusterkmeans' not found".
fiveclusterkmeans <- kmeans(ScaledData[, 4:12], 5, nstart = 10)
fiveclusterkmeans
# Assign the clusters for each observation for k=3,4,5 to a new dataframe.
# data.frame() names the added columns threeclusterkmeans.cluster,
# fourclusterkmeans.cluster and fiveclusterkmeans.cluster.
Clusters <- data.frame(
  MasterData,
  threeclusterkmeans$cluster,
  fourclusterkmeans$cluster,
  fiveclusterkmeans$cluster
)
# Graph the different solutions - Three Clusters.
# Colour marks the cluster assignment; shape marks Language background.
# To mark a different variable by shape, change the name in "shape = Language".
ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))

ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))

ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))

ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))

ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))

ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))

ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))

ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))

ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))

ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))

# Graph the different solutions - Four Clusters.
# Colour marks the cluster assignment; shape marks Sex.
# To mark a different variable by shape, change the name in "shape = Sex".
# size = 2 is a fixed point size, so it is set outside aes(): inside aes() it
# is treated as a data mapping, which adds a meaningless size legend and
# rescales the point size instead of using 2 directly.
ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Sex, color = factor(fourclusterkmeans.cluster)), size = 2)

ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Sex, color = factor(fourclusterkmeans.cluster)), size = 2)

ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Sex, color = factor(fourclusterkmeans.cluster)), size = 2)

ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Sex, color = factor(fourclusterkmeans.cluster)), size = 2)

ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Sex, color = factor(fourclusterkmeans.cluster)), size = 2)

ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Sex, color = factor(fourclusterkmeans.cluster)), size = 2)

ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Sex, color = factor(fourclusterkmeans.cluster)), size = 2)

ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Sex, color = factor(fourclusterkmeans.cluster)), size = 2)

ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Sex, color = factor(fourclusterkmeans.cluster)), size = 2)

ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Sex, color = factor(fourclusterkmeans.cluster)), size = 2)

# Graph the different solutions - Five Clusters.
# Colour marks the cluster assignment; shape marks Language background.
# To mark a different variable by shape, change the name in "shape = Language".
# These plots read the fiveclusterkmeans.cluster column of Clusters.
ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
---
title: "R Notebook"
output: html_notebook
---
  
```{r}
#load relevant libraries
library(dplyr)
library(ggplot2)

# Read the cleaned data summary export into a new data frame
MasterData <- read.csv(file = "~/downloads/Data Summary_CLEAN.csv")
# Keep only the first 12 columns; the remaining ones are redundant
MasterData <- MasterData[, seq_len(12)]
# Replace the imported headers with short variable names
names(MasterData) <- c(
  "ID", "Sex", "Language",
  "Final_grade", "IU", "Per_correct", "Per_aware",
  "Time_hw", "Time_pron", "Time_LS",
  "Per_correct_hw", "Per_complete_LS"
)

```

```{r}
# Examine scatter plots for different variable combination pairs.
# Scatter-plot matrix of all pairs. data.matrix() converts character/factor
# columns to integer codes first: pairs() stops with
# "non-numeric argument to 'pairs'" when handed character columns, which is
# what read.csv() produces for text columns by default since R 4.0.0.
pairs(data.matrix(MasterData))
# Each plot below overlays stat_sum() on the raw points so that coincident
# observations are drawn with a size proportional to their count.
# Time spent on Connect homework, Percentage of correct homework responses
ggplot(MasterData, aes(x = Time_hw, y = Per_correct_hw)) +
  geom_point() +
  stat_sum(aes(group = 1))
# Time spent on Connect homework, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(x = Time_hw, y = Time_LS)) +
  geom_point() +
  stat_sum(aes(group = 1))
# Percentage of correct homework responses, Final course grade
ggplot(MasterData, aes(x = Per_correct_hw, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))
# Percentage of correct LearnSmart responses, Percentage of awareness of correct/incorrect responses
ggplot(MasterData, aes(x = Per_correct, y = Per_aware)) +
  geom_point() +
  stat_sum(aes(group = 1))
# Percentage of completion of assigned LearnSmart activities, Final course grade
ggplot(MasterData, aes(x = Per_complete_LS, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))
# Percentage of completion of assigned LearnSmart activities, Percentage of incorrect and unaware LearnSmart responses
ggplot(MasterData, aes(x = Per_complete_LS, y = IU)) +
  geom_point() +
  stat_sum(aes(group = 1))
# Time spent on LearnSmart adaptive activities, Final course grade
ggplot(MasterData, aes(x = Time_LS, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))
# Percentage of incorrect and unaware LearnSmart responses, Final course grade
ggplot(MasterData, aes(x = IU, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))
# Percentage of incorrect and unaware LearnSmart responses, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(x = IU, y = Time_LS)) +
  geom_point() +
  stat_sum(aes(group = 1))
# Percentage of incorrect and unaware LearnSmart responses, Time spent on Connect homework
ggplot(MasterData, aes(x = IU, y = Time_hw)) +
  geom_point() +
  stat_sum(aes(group = 1))
```

```{r}
# Build the scaled data frame: columns 1-3 of MasterData are carried over
# unchanged, and Final_grade through Per_complete_LS are passed through scale()
ScaledData <- as.data.frame(
  cbind(
    MasterData[, 1:3],
    scale(select(MasterData, Final_grade:Per_complete_LS))
  )
)
# Report the mean and standard deviation of each scaled column (columns 4-12)
sapply(X = ScaledData[, 4:12], FUN = mean)
sapply(X = ScaledData[, 4:12], FUN = sd)
```

```{r}
# Within Group Sum of Squares ("elbow") plot.
#
# Runs kmeans() for every cluster count from 2 to nc and plots the total
# within-group sum of squares against the number of clusters.
#
# Args:
#   data: numeric data frame or matrix, one row per observation.
#   nc:   largest number of clusters to evaluate (default 15); must be >= 1.
#   seed: value passed to set.seed() before every kmeans() call so that
#         repeated runs give the same curve.
#
# Returns (invisibly) the numeric vector of sums of squares; element k
# belongs to the k-cluster solution.
wssplot <- function(data, nc=15, seed=1234){
  stopifnot(nc >= 1)
  # One cluster: total sum of squares around the column means
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  # seq_len(nc)[-1] is 2..nc and is empty for nc == 1, whereas 2:nc would
  # count down (2, 1) and leave wss longer than the x axis.
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}

# Draw the WSS graph for 1 to 15 clusters using columns 4-12 of ScaledData
wssplot(data = ScaledData[, 4:12], nc = 15, seed = 1234)
```
```{r}
# Fit candidate k-means solutions on columns 4-12 of ScaledData and show each
# one. nstart = 10 makes kmeans() try ten random starting sets per solution.
# Three clusters
threeclusterkmeans <- kmeans(x = ScaledData[, 4:12], centers = 3, nstart = 10)
threeclusterkmeans
# Four clusters
fourclusterkmeans <- kmeans(x = ScaledData[, 4:12], centers = 4, nstart = 10)
fourclusterkmeans
```
```{r}
# Assign the clusters for each observation for k=3,4,5 to a new dataframe.
# The five-cluster plots later in this notebook read the
# fiveclusterkmeans.cluster column, but no five-cluster solution was fitted
# and the column was missing from Clusters, so fit it here and include it.
fiveclusterkmeans <- kmeans(ScaledData[, 4:12], 5, nstart = 10)
fiveclusterkmeans
# data.frame() names the added columns threeclusterkmeans.cluster,
# fourclusterkmeans.cluster and fiveclusterkmeans.cluster.
Clusters <- data.frame(
  ScaledData,
  threeclusterkmeans$cluster,
  fourclusterkmeans$cluster,
  fiveclusterkmeans$cluster
)
```

```{r}
# Graph the different solutions - Three Clusters.
# Colour marks the cluster assignment; shape marks Language background.
# To mark a different variable by shape, change the name in "shape = Language".
ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))
ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))
ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))
ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(threeclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(threeclusterkmeans.cluster)))
```

```{r}
# Graph the different solutions - Four Clusters.
# Colour marks the cluster assignment; shape marks Language background.
# To mark a different variable by shape, change the name in "shape = Language".
# size = 2 is a fixed point size, so it is set outside aes(): inside aes() it
# is treated as a data mapping, which adds a meaningless size legend and
# rescales the point size instead of using 2 directly.
ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fourclusterkmeans.cluster)), size = 2)
ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fourclusterkmeans.cluster)), size = 2)
ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fourclusterkmeans.cluster)), size = 2)
ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fourclusterkmeans.cluster)), size = 2)
ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fourclusterkmeans.cluster)), size = 2)
ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fourclusterkmeans.cluster)), size = 2)
ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fourclusterkmeans.cluster)), size = 2)
ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fourclusterkmeans.cluster)), size = 2)
ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fourclusterkmeans.cluster)), size = 2)
ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(fourclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fourclusterkmeans.cluster)), size = 2)
```
```{r}
# Graph the different solutions - Five Clusters.
# Colour marks the cluster assignment; shape marks Language background.
# To mark a different variable by shape, change the name in "shape = Language".
# These plots read the fiveclusterkmeans.cluster column of Clusters.
ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))
ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(fiveclusterkmeans.cluster))) +
  geom_point(aes(shape = Language, color = factor(fiveclusterkmeans.cluster)))